/******************************************************************************
 * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include <cub/device/device_memcpy.cuh>
#include <cub/iterator/transform_input_iterator.cuh>
#include <cub/util_ptx.cuh>

#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/logical.h>
#include <thrust/sequence.h>

#include <algorithm>
#include <cstdint>
#include <limits>
#include <numeric>
#include <random>
#include <type_traits>
#include <vector>

#include "test_util.h"

/**
 * @brief Host-side random data generation
 */
template <typename T>
void GenerateRandomData(
  T *rand_out,
  const std::size_t num_items,
  const T min_rand_val          = std::numeric_limits<T>::min(),
  const T max_rand_val          = std::numeric_limits<T>::max(),
  const std::uint_fast32_t seed = 320981U,
  typename std::enable_if<std::is_integral<T>::value && (sizeof(T) >= 2)>::type * = nullptr)
{
  // initialize random number generator
  std::mt19937 rng(seed);
  std::uniform_int_distribution<T> uni_dist(min_rand_val, max_rand_val);

  // generate random numbers
  for (std::size_t i = 0; i < num_items; ++i)
  {
    rand_out[i] = uni_dist(rng);
  }
}

template <typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename BufferOffsetT>
void __global__ BaselineBatchMemCpyKernel(InputBufferIt input_buffer_it,
                                          OutputBufferIt output_buffer_it,
                                          BufferSizeIteratorT buffer_sizes,
                                          BufferOffsetT num_buffers)
{
  BufferOffsetT gtid = blockDim.x * blockIdx.x + threadIdx.x;
  if (gtid >= num_buffers)
  {
    return;
  }
  for (BufferOffsetT i = 0; i < buffer_sizes[gtid]; i++)
  {
    reinterpret_cast<uint8_t *>(output_buffer_it[gtid])[i] =
      reinterpret_cast<uint8_t *>(input_buffer_it[gtid])[i];
  }
}

template <typename InputBufferIt, typename OutputBufferIt, typename BufferSizeIteratorT>
void InvokeBaselineBatchMemcpy(InputBufferIt input_buffer_it,
                               OutputBufferIt output_buffer_it,
                               BufferSizeIteratorT buffer_sizes,
                               uint32_t num_buffers)
{
  constexpr uint32_t block_threads = 128U;
  uint32_t num_blocks              = (num_buffers + block_threads - 1) / block_threads;
  BaselineBatchMemCpyKernel<<<num_blocks, block_threads>>>(input_buffer_it,
                                                           output_buffer_it,
                                                           buffer_sizes,
                                                           num_buffers);
}

template <typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename BufferOffsetT>
void __global__ BaselineBatchMemCpyPerBlockKernel(InputBufferIt input_buffer_it,
                                                  OutputBufferIt output_buffer_it,
                                                  BufferSizeIteratorT buffer_sizes,
                                                  BufferOffsetT num_buffers)
{
  BufferOffsetT gbid = blockIdx.x;
  if (gbid >= num_buffers)
  {
    return;
  }
  for (BufferOffsetT i = threadIdx.x; i < buffer_sizes[gbid] / 8; i += blockDim.x)
  {
    reinterpret_cast<uint64_t *>(output_buffer_it[gbid])[i] =
      reinterpret_cast<uint64_t *>(input_buffer_it[gbid])[i];
  }
}

/**
 * @brief Used for generating a shuffled but cohesive sequence of output-buffer offsets for the
 * sequence of input-buffers.
 */
template <typename BufferOffsetT, typename ByteOffsetT, typename BufferSizeT>
std::vector<ByteOffsetT> GetShuffledBufferOffsets(const std::vector<BufferSizeT> &buffer_sizes,
                                                  const std::uint_fast32_t seed = 320981U)
{
  BufferOffsetT num_buffers = static_cast<BufferOffsetT>(buffer_sizes.size());

  // We're remapping the i-th buffer to pmt_idxs[i]
  std::mt19937 rng(seed);
  std::vector<BufferOffsetT> pmt_idxs(num_buffers);
  std::iota(pmt_idxs.begin(), pmt_idxs.end(), static_cast<BufferOffsetT>(0));
  std::shuffle(std::begin(pmt_idxs), std::end(pmt_idxs), rng);

  // Compute the offsets using the new mapping
  ByteOffsetT running_offset = {};
  std::vector<ByteOffsetT> permuted_offsets;
  permuted_offsets.reserve(num_buffers);
  for (auto permuted_buffer_idx : pmt_idxs)
  {
    permuted_offsets.emplace_back(running_offset);
    running_offset += buffer_sizes[permuted_buffer_idx];
  }

  // Generate the scatter indexes that identify where each buffer was mapped to
  std::vector<BufferOffsetT> scatter_idxs(num_buffers);
  for (BufferOffsetT i = 0; i < num_buffers; i++)
  {
    scatter_idxs[pmt_idxs[i]] = i;
  }

  std::vector<ByteOffsetT> new_offsets(num_buffers);
  for (BufferOffsetT i = 0; i < num_buffers; i++)
  {
    new_offsets[i] = permuted_offsets[scatter_idxs[i]];
  }

  return new_offsets;
}

/**
 * @brief Function object class template that takes an offset and returns an iterator at the given
 * offset relative to a fixed base iterator.
 *
 * @tparam IteratorT The random-access iterator type to be returned
 */
template <typename IteratorT>
struct OffsetToPtrOp
{
  template <typename T>
  __host__ __device__ __forceinline__ IteratorT operator()(T offset) const
  {
    return base_it + offset;
  }
  IteratorT base_it;
};

enum class TestDataGen
{
  // Random offsets into a data segment
  RANDOM,

  // Buffers cohesively reside next to each other
  CONSECUTIVE
};

/**
 * @brief
 *
 * @tparam AtomicT The most granular type being copied. All source and destination pointers will be
 * aligned based on this type, the number of bytes being copied will be an integer multiple of this
 * type's size
 * @tparam BufferOffsetT Type used for indexing into the array of buffers
 * @tparam BufferSizeT Type used for indexing into individual bytes of a buffer (large enough to
 * cover the max buffer size)
 * @tparam ByteOffsetT Type used for indexing into bytes over *all* the buffers' sizes
 */
template <typename AtomicT, typename BufferOffsetT, typename BufferSizeT, typename ByteOffsetT>
void RunTest(BufferOffsetT num_buffers,
             BufferSizeT min_buffer_size,
             BufferSizeT max_buffer_size,
             TestDataGen input_gen,
             TestDataGen output_gen)
{
  using SrcPtrT = uint8_t *;

  // Buffer segment data (their offsets and sizes)
  std::vector<BufferSizeT> h_buffer_sizes(num_buffers);
  std::vector<ByteOffsetT> h_buffer_src_offsets(num_buffers);
  std::vector<ByteOffsetT> h_buffer_dst_offsets(num_buffers);

  // Device-side resources
  void *d_in                        = nullptr;
  void *d_out                       = nullptr;
  ByteOffsetT *d_buffer_src_offsets = nullptr;
  ByteOffsetT *d_buffer_dst_offsets = nullptr;
  BufferSizeT *d_buffer_sizes       = nullptr;
  void *d_temp_storage              = nullptr;
  size_t temp_storage_bytes         = 0;

  // Generate the buffer sizes
  GenerateRandomData(h_buffer_sizes.data(), h_buffer_sizes.size(), min_buffer_size, max_buffer_size);

  // Make sure buffer sizes are a multiple of the most granular unit (one AtomicT) being copied
  // (round down)
  for (BufferOffsetT i = 0; i < num_buffers; i++)
  {
    h_buffer_sizes[i] = (h_buffer_sizes[i] / sizeof(AtomicT)) * sizeof(AtomicT);
  }

  // Compute the total bytes to be copied
  ByteOffsetT num_total_bytes = 0;
  for (BufferOffsetT i = 0; i < num_buffers; i++)
  {
    if (input_gen == TestDataGen::CONSECUTIVE)
    {
      h_buffer_src_offsets[i] = num_total_bytes;
    }
    if (output_gen == TestDataGen::CONSECUTIVE)
    {
      h_buffer_dst_offsets[i] = num_total_bytes;
    }
    num_total_bytes += h_buffer_sizes[i];
  }

  // Shuffle input buffer source-offsets
  std::uint_fast32_t shuffle_seed = 320981U;
  if (input_gen == TestDataGen::RANDOM)
  {
    h_buffer_src_offsets = GetShuffledBufferOffsets<BufferOffsetT, ByteOffsetT>(h_buffer_sizes,
                                                                                shuffle_seed);
    shuffle_seed += 42;
  }

  // Shuffle input buffer source-offsets
  if (output_gen == TestDataGen::RANDOM)
  {
    h_buffer_dst_offsets = GetShuffledBufferOffsets<BufferOffsetT, ByteOffsetT>(h_buffer_sizes,
                                                                                shuffle_seed);
  }

  // Get temporary storage requirements
  CubDebugExit(cub::DeviceMemcpy::Batched(d_temp_storage,
                                          temp_storage_bytes,
                                          static_cast<SrcPtrT *>(nullptr),
                                          static_cast<SrcPtrT *>(nullptr),
                                          d_buffer_sizes,
                                          num_buffers));

  // Check if there's sufficient device memory to run this test
  std::size_t total_required_mem = num_total_bytes +                                 //
                                   num_total_bytes +                                 //
                                   (num_buffers * sizeof(d_buffer_src_offsets[0])) + //
                                   (num_buffers * sizeof(d_buffer_dst_offsets[0])) + //
                                   (num_buffers * sizeof(d_buffer_sizes[0])) +       //
                                   temp_storage_bytes;                               //
  if (TotalGlobalMem() < total_required_mem)
  {
    std::cout
      << "Skipping the test due to insufficient device memory\n"                                  //
      << " - Required: " << total_required_mem << " B, available: " << TotalGlobalMem() << " B\n" //
      << " - Skipped test instance: "                                                             //
      << " -> Min. buffer size: " << min_buffer_size << ", max. buffer size: " << max_buffer_size //
      << ", num_buffers: " << num_buffers                                                         //
      << ", in_gen: " << ((input_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE")            //
      << ", out_gen: " << ((output_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE");
    return;
  }

  cudaEvent_t events[2];
  cudaEventCreate(&events[0]);
  cudaEventCreate(&events[1]);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Allocate device memory
  CubDebugExit(cudaMalloc(&d_in, num_total_bytes));
  CubDebugExit(cudaMalloc(&d_out, num_total_bytes));
  CubDebugExit(cudaMalloc(&d_buffer_src_offsets, num_buffers * sizeof(d_buffer_src_offsets[0])));
  CubDebugExit(cudaMalloc(&d_buffer_dst_offsets, num_buffers * sizeof(d_buffer_dst_offsets[0])));
  CubDebugExit(cudaMalloc(&d_buffer_sizes, num_buffers * sizeof(d_buffer_sizes[0])));
  CubDebugExit(cudaMalloc(&d_temp_storage, temp_storage_bytes));

  // Populate the data source with random data
  using RandomInitAliasT         = uint16_t;
  std::size_t num_aliased_factor = sizeof(RandomInitAliasT) / sizeof(uint8_t);
  std::size_t num_aliased_units  = CUB_QUOTIENT_CEILING(num_total_bytes, num_aliased_factor);
  std::unique_ptr<uint8_t[]> h_in(new uint8_t[num_aliased_units * num_aliased_factor]);
  std::unique_ptr<uint8_t[]> h_out(new uint8_t[num_total_bytes]);
  std::unique_ptr<uint8_t[]> h_gpu_results(new uint8_t[num_total_bytes]);

  // Generate random offsets into the random-bits data buffer
  GenerateRandomData(reinterpret_cast<RandomInitAliasT *>(h_in.get()), num_aliased_units);

  // Prepare d_buffer_srcs
  OffsetToPtrOp<SrcPtrT> src_transform_op{static_cast<SrcPtrT>(d_in)};
  cub::TransformInputIterator<SrcPtrT, OffsetToPtrOp<SrcPtrT>, ByteOffsetT *> d_buffer_srcs(
    d_buffer_src_offsets,
    src_transform_op);

  // Prepare d_buffer_dsts
  OffsetToPtrOp<SrcPtrT> dst_transform_op{static_cast<SrcPtrT>(d_out)};
  cub::TransformInputIterator<SrcPtrT, OffsetToPtrOp<SrcPtrT>, ByteOffsetT *> d_buffer_dsts(
    d_buffer_dst_offsets,
    dst_transform_op);

  // Prepare random data segment (which serves for the buffer sources)
  CubDebugExit(cudaMemcpyAsync(d_in, h_in.get(), num_total_bytes, cudaMemcpyHostToDevice, stream));

  // Prepare d_buffer_src_offsets
  CubDebugExit(cudaMemcpyAsync(d_buffer_src_offsets,
                               h_buffer_src_offsets.data(),
                               h_buffer_src_offsets.size() * sizeof(h_buffer_src_offsets[0]),
                               cudaMemcpyHostToDevice,
                               stream));

  // Prepare d_buffer_dst_offsets
  CubDebugExit(cudaMemcpyAsync(d_buffer_dst_offsets,
                               h_buffer_dst_offsets.data(),
                               h_buffer_dst_offsets.size() * sizeof(h_buffer_dst_offsets[0]),
                               cudaMemcpyHostToDevice,
                               stream));

  // Prepare d_buffer_sizes
  CubDebugExit(cudaMemcpyAsync(d_buffer_sizes,
                               h_buffer_sizes.data(),
                               h_buffer_sizes.size() * sizeof(h_buffer_sizes[0]),
                               cudaMemcpyHostToDevice,
                               stream));

  // Record event before algorithm
  cudaEventRecord(events[0], stream);

  // Invoke device-side algorithm being under test
  CubDebugExit(cub::DeviceMemcpy::Batched(d_temp_storage,
                                          temp_storage_bytes,
                                          d_buffer_srcs,
                                          d_buffer_dsts,
                                          d_buffer_sizes,
                                          num_buffers,
                                          stream));

  // Record event after algorithm
  cudaEventRecord(events[1], stream);

  // Copy back the output buffer
  CubDebugExit(
    cudaMemcpyAsync(h_gpu_results.get(), d_out, num_total_bytes, cudaMemcpyDeviceToHost, stream));

  // Make sure results have been copied back to the host
  CubDebugExit(cudaStreamSynchronize(stream));

  // CPU-side result generation for verification
  for (BufferOffsetT i = 0; i < num_buffers; i++)
  {
    std::memcpy(h_out.get() + h_buffer_dst_offsets[i],
                h_in.get() + h_buffer_src_offsets[i],
                h_buffer_sizes[i]);
  }

  float duration = 0;
  cudaEventElapsedTime(&duration, events[0], events[1]);

#ifdef CUB_TEST_BENCHMARK
  size_t stats_src_offsets = sizeof(ByteOffsetT) * num_buffers;
  size_t stats_dst_offsets = sizeof(ByteOffsetT) * num_buffers;
  size_t stats_sizes       = sizeof(BufferSizeT) * num_buffers;
  size_t stats_data_copied = 2 * num_total_bytes;

  std::cout
    << "Min. buffer size: " << min_buffer_size << ", max. buffer size: " << max_buffer_size     //
    << ", num_buffers: " << num_buffers                                                         //
    << ", in_gen: " << ((input_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE")            //
    << ", out_gen: " << ((output_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE")          //
    << ", src size: " << stats_src_offsets << ", dst size: " << stats_dst_offsets               //
    << ", sizes size: " << stats_sizes << ", cpy_data_size: " << stats_data_copied              //
    << ", total: " << (stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied) //
    << ", duration: " << duration                                                               //
    << ", BW: "
    << ((double)(stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied) /
        1000000000.0) /
         (duration / 1000.0)
    << "GB/s \n";
#endif

  for (ByteOffsetT i = 0; i < num_total_bytes; i++)
  {
    if (h_gpu_results.get()[i] != h_out.get()[i])
    {
      std::cout << "Mismatch at index " << i
                << ", CPU vs. GPU: " << static_cast<uint16_t>(h_gpu_results.get()[i]) << ", "
                << static_cast<uint16_t>(h_out.get()[i]) << "\n";
    }
    AssertEquals(h_out.get()[i], h_gpu_results.get()[i]);
  }

  CubDebugExit(cudaFree(d_in));
  CubDebugExit(cudaFree(d_out));
  CubDebugExit(cudaFree(d_buffer_src_offsets));
  CubDebugExit(cudaFree(d_buffer_dst_offsets));
  CubDebugExit(cudaFree(d_buffer_sizes));
  CubDebugExit(cudaFree(d_temp_storage));
}

template <int LOGICAL_WARP_SIZE, typename VectorT, typename ByteOffsetT>
__global__ void TestVectorizedCopyKernel(const void *d_in, void *d_out, ByteOffsetT copy_size)
{
  cub::detail::VectorizedCopy<LOGICAL_WARP_SIZE, VectorT>(threadIdx.x, d_out, copy_size, d_in);
}

struct TupleMemberEqualityOp
{
  template <typename T>
  __host__ __device__ __forceinline__ bool operator()(T tuple)
  {
    return thrust::get<0>(tuple) == thrust::get<1>(tuple);
  }
};

/**
 * @brief Tests the VectorizedCopy for various aligned and misaligned input and output pointers.
 * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t)
 */
template <typename VectorT>
void TestVectorizedCopy()
{

  constexpr uint32_t threads_per_block = 8;

  std::vector<std::size_t> in_offsets{0, 1, sizeof(uint32_t) - 1};
  std::vector<std::size_t> out_offsets{0, 1, sizeof(VectorT) - 1};
  std::vector<std::size_t> copy_sizes{0,
                                      1,
                                      sizeof(uint32_t),
                                      sizeof(VectorT),
                                      2 * threads_per_block * sizeof(VectorT)};
  for (auto copy_sizes_it = std::begin(copy_sizes); copy_sizes_it < std::end(copy_sizes);
       copy_sizes_it++)
  {
    for (auto in_offsets_it = std::begin(in_offsets); in_offsets_it < std::end(in_offsets);
         in_offsets_it++)
    {
      for (auto out_offsets_it = std::begin(out_offsets); out_offsets_it < std::end(out_offsets);
           out_offsets_it++)
      {
        std::size_t in_offset  = *in_offsets_it;
        std::size_t out_offset = *out_offsets_it;
        std::size_t copy_size  = *copy_sizes_it;

        // Prepare data
        const std::size_t alloc_size_in  = in_offset + copy_size;
        const std::size_t alloc_size_out = out_offset + copy_size;
        thrust::device_vector<char> data_in(alloc_size_in);
        thrust::device_vector<char> data_out(alloc_size_out);
        thrust::sequence(data_in.begin(), data_in.end(), static_cast<char>(0));
        thrust::fill_n(data_out.begin(), alloc_size_out, static_cast<char>(0x42));

        auto d_in  = thrust::raw_pointer_cast(data_in.data());
        auto d_out = thrust::raw_pointer_cast(data_out.data());

        TestVectorizedCopyKernel<threads_per_block, VectorT>
          <<<1, threads_per_block>>>(d_in + in_offset,
                                     d_out + out_offset,
                                     static_cast<int>(copy_size));
        auto zip_it = thrust::make_zip_iterator(data_in.begin() + in_offset,
                                                data_out.begin() + out_offset);

        bool success = thrust::all_of(zip_it, zip_it + copy_size, TupleMemberEqualityOp{});
        AssertTrue(success);
      }
    }
  }
}

template <uint32_t NUM_ITEMS, uint32_t MAX_ITEM_VALUE, bool PREFER_POW2_BITS>
__global__ void TestBitPackedCounterKernel(uint32_t *bins,
                                           uint32_t *increments,
                                           uint32_t *counts_out,
                                           uint32_t num_items)
{
  using BitPackedCounterT =
    cub::detail::BitPackedCounter<NUM_ITEMS, MAX_ITEM_VALUE, PREFER_POW2_BITS>;
  BitPackedCounterT counter{};
  for (uint32_t i = 0; i < num_items; i++)
  {
    counter.Add(bins[i], increments[i]);
  }

  for (uint32_t i = 0; i < NUM_ITEMS; i++)
  {
    counts_out[i] = counter.Get(i);
  }
}

/**
 * @brief Tests BitPackedCounter that's used for computing the histogram of buffer sizes (i.e.,
 * small, medium, large).
 */
template <uint32_t NUM_ITEMS, uint32_t MAX_ITEM_VALUE>
void TestBitPackedCounter(const std::uint_fast32_t seed = 320981U)
{

  constexpr uint32_t min_increment = 0;
  constexpr uint32_t max_increment = 4;
  constexpr double avg_increment   = static_cast<double>(min_increment) +
                                   (static_cast<double>(max_increment - min_increment) / 2.0);
  std::uint32_t num_increments = 
      static_cast<uint32_t>(static_cast<double>(MAX_ITEM_VALUE * NUM_ITEMS) / avg_increment);

  // Test input data
  std::array<uint64_t, NUM_ITEMS> reference_counters{};
  thrust::host_vector<uint32_t> h_bins(num_increments);
  thrust::host_vector<uint32_t> h_increments(num_increments);

  // Generate random test input data
  GenerateRandomData(thrust::raw_pointer_cast(h_bins.data()),
                     num_increments,
                     0U,
                     NUM_ITEMS - 1U,
                     seed);
  GenerateRandomData(thrust::raw_pointer_cast(h_increments.data()),
                     num_increments,
                     min_increment,
                     max_increment,
                     (seed + 17));

  // Make sure test data does not overflow any of the counters
  for (std::size_t i = 0; i < num_increments; i++)
  {
    // New increment for this bin would overflow => zero this increment
    if (reference_counters[h_bins[i]] + h_increments[i] >= MAX_ITEM_VALUE)
    {
      h_increments[i] = 0;
    }
    else
    {
      reference_counters[h_bins[i]] += h_increments[i];
    }
  }

  // Device memory
  thrust::device_vector<uint32_t> bins_in(num_increments);
  thrust::device_vector<uint32_t> increments_in(num_increments);
  thrust::device_vector<uint32_t> counts_out(NUM_ITEMS);

  // Initialize device-side test data
  bins_in       = h_bins;
  increments_in = h_increments;

  // Memory for GPU-generated results
  thrust::host_vector<uint32_t> host_counts(num_increments);

  // Reset counters to arbitrary random value
  thrust::fill(counts_out.begin(), counts_out.end(), 814920U);

  // Run tests with densely bit-packed counters
  TestBitPackedCounterKernel<NUM_ITEMS, MAX_ITEM_VALUE, false>
    <<<1, 1>>>(thrust::raw_pointer_cast(bins_in.data()),
               thrust::raw_pointer_cast(increments_in.data()),
               thrust::raw_pointer_cast(counts_out.data()),
               num_increments);

  // Result verification
  host_counts = counts_out;
  for (uint32_t i = 0; i < NUM_ITEMS; i++)
  {
    AssertEquals(reference_counters[i], host_counts[i]);
  }

  // Reset counters to arbitrary random value
  thrust::fill(counts_out.begin(), counts_out.end(), 814920U);

  // Run tests with bit-packed counters, where bit-count is a power-of-two
  TestBitPackedCounterKernel<NUM_ITEMS, MAX_ITEM_VALUE, true>
    <<<1, 1>>>(thrust::raw_pointer_cast(bins_in.data()),
               thrust::raw_pointer_cast(increments_in.data()),
               thrust::raw_pointer_cast(counts_out.data()),
               num_increments);

  // Result verification
  host_counts = counts_out;
  for (uint32_t i = 0; i < NUM_ITEMS; i++)
  {
    AssertEquals(reference_counters[i], host_counts[i]);
  }
}

int main(int argc, char **argv)
{
  CommandLineArgs args(argc, argv);

  // Initialize device
  CubDebugExit(args.DeviceInit());

  //---------------------------------------------------------------------
  // VectorizedCopy tests
  //---------------------------------------------------------------------
  TestVectorizedCopy<uint32_t>();
  TestVectorizedCopy<uint4>();

  //---------------------------------------------------------------------
  // BitPackedCounter tests
  //---------------------------------------------------------------------
  TestBitPackedCounter<1, 1>();
  TestBitPackedCounter<1, (0x01U << 16)>();
  TestBitPackedCounter<4, 1>();
  TestBitPackedCounter<4, 2>();
  TestBitPackedCounter<4, 255>();
  TestBitPackedCounter<4, 256>();
  TestBitPackedCounter<8, 1024>();
  TestBitPackedCounter<32, 1>();
  TestBitPackedCounter<32, 256>();

  //---------------------------------------------------------------------
  // DeviceMemcpy::Batched tests
  //---------------------------------------------------------------------
  // The most granular type being copied. Buffer's will be aligned and their size be an integer
  // multiple of this type
  using AtomicCopyT = uint8_t;

  // Type used for indexing into the array of buffers
  using BufferOffsetT = uint32_t;

  // Type used for indexing into individual bytes of a buffer (large enough to cover the max buffer
  using BufferSizeT = uint32_t;

  // Type used for indexing into bytes over *all* the buffers' sizes
  using ByteOffsetT = uint32_t;

  // Total number of bytes that are targeted to be copied on each run
  const BufferOffsetT target_copy_size = 64U << 20;

  // The number of randomly
  constexpr std::size_t num_rnd_buffer_range_tests = 32;

  // Each buffer's size will be random within this interval
  std::vector<std::pair<std::size_t, std::size_t>> buffer_size_ranges = {{0, 1},
                                                                         {1, 2},
                                                                         {0, 16},
                                                                         {1, 32},
                                                                         {1, 1024},
                                                                         {1, 32 * 1024},
                                                                         {128 * 1024, 256 * 1024},
                                                                         {target_copy_size,
                                                                          target_copy_size}};

  std::mt19937 rng(0);
  std::uniform_int_distribution<std::size_t> size_dist(1, 1000000);
  for (std::size_t i = 0; i < num_rnd_buffer_range_tests; i++)
  {
    auto range_begin = size_dist(rng);
    auto range_end   = size_dist(rng);
    if (range_begin > range_end)
    {
      std::swap(range_begin, range_end);
    }
    buffer_size_ranges.push_back({range_begin, range_end});
  }

  for (const auto &buffer_size_range : buffer_size_ranges)
  {
    BufferSizeT min_buffer_size =
      static_cast<BufferSizeT>(CUB_ROUND_UP_NEAREST(buffer_size_range.first, sizeof(AtomicCopyT)));
    BufferSizeT max_buffer_size =
      static_cast<BufferSizeT>(CUB_ROUND_UP_NEAREST(buffer_size_range.second,
                                                    static_cast<BufferSizeT>(sizeof(AtomicCopyT))));
    double average_buffer_size = (min_buffer_size + max_buffer_size) / 2.0;
    BufferOffsetT target_num_buffers =
      static_cast<BufferOffsetT>(target_copy_size / average_buffer_size);

    // Run tests with input buffer being consecutive and output buffers being consecutive
    RunTest<AtomicCopyT, BufferOffsetT, BufferSizeT, ByteOffsetT>(target_num_buffers,
                                                                  min_buffer_size,
                                                                  max_buffer_size,
                                                                  TestDataGen::CONSECUTIVE,
                                                                  TestDataGen::CONSECUTIVE);

    // Run tests with input buffer being randomly shuffled and output buffers being randomly
    // shuffled
    RunTest<AtomicCopyT, BufferOffsetT, BufferSizeT, ByteOffsetT>(target_num_buffers,
                                                                  min_buffer_size,
                                                                  max_buffer_size,
                                                                  TestDataGen::RANDOM,
                                                                  TestDataGen::RANDOM);
  }

  //---------------------------------------------------------------------
  // DeviceMemcpy::Batched test with 64-bit offsets
  //---------------------------------------------------------------------
  using ByteOffset64T = uint64_t;
  using BufferSize64T = uint64_t;
  ByteOffset64T large_target_copy_size =
    static_cast<ByteOffset64T>(std::numeric_limits<uint32_t>::max()) + (128ULL * 1024ULL * 1024ULL);
  // Make sure min_buffer_size is in fact smaller than max buffer size
  constexpr BufferOffsetT single_buffer = 1;

  // Run tests with input buffer being consecutive and output buffers being consecutive
  RunTest<AtomicCopyT, BufferOffsetT, BufferSize64T, ByteOffset64T>(single_buffer,
                                                                    large_target_copy_size,
                                                                    large_target_copy_size,
                                                                    TestDataGen::CONSECUTIVE,
                                                                    TestDataGen::CONSECUTIVE);
}
